%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% This file is part of the book
%%
%% Probability and Its Applications
%% http://code.google.com/p/probability-book/
%%
%% Copyright (C) 2010 Minh Van Nguyen <nguyenminh2@gmail.com>
%%
%% See the file COPYING for copying conditions.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\chapter{Discrete Random Variables}
\index{random variable!discrete}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Random variables and expectation}
\index{random variable}
\index{expectation}

We are often interested in some value associated with a random event
rather than the event itself. Take the example of tossing two
dice\index{dice}: we only want to know the sum of the two dice, not
the outcome of each die.

A \emph{random variable}\index{random variable} $X$ on a sample
space\index{sample space} $\Omega$ is a real-valued function
$X: \Omega \to \R$. A
\emph{discrete random variable}\index{random variable!discrete} is a
random variable that takes only a finite or countably infinite number
of values.

Let $X$ be a random variable and let $a \in \R$. The event
``$X = a$'' represents the set $\{s \in \Omega \mid X(s) = a\}$ and
the probability of this event is written
\[
\Pr(X = a)
=
\sum_{s \in \Omega: X(s) = a} \Pr(s).
\]
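
A short Python sketch can make this concrete. The code below, a
minimal illustration assuming a Python~3 interpreter, enumerates the
sample space of two dice and computes $\Pr(X = a)$ for the sum $X$ by
summing the probabilities of the matching outcomes:
%%
\begin{verbatim}
from fractions import Fraction
from itertools import product

# Sample space: all 36 equally likely outcomes of two dice.
omega = list(product(range(1, 7), repeat=2))

def pr_sum_equals(a):
    # Pr(X = a) = sum of Pr(s) over outcomes s with X(s) = a,
    # where each outcome s has probability 1/36.
    favorable = [s for s in omega if sum(s) == a]
    return Fraction(len(favorable), len(omega))

print(pr_sum_equals(7))  # 1/6
print(pr_sum_equals(2))  # 1/36
\end{verbatim}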

Two random variables $X$ and $Y$ are
\emph{independent}\index{random variable!independent} if
\[
\Pr\big((X = x) \cap (Y = y)\big)
=
\Pr(X = x) \cdot \Pr(Y = y)
\]
for all $x,y \in \R$. Furthermore, the random variables
$X_1, X_2, \dots, X_k$ are
\emph{mutually independent}\index{random variable!mutually independent}
if for any subset $I \subseteq \{1, 2, \dots, k\}$ and any values
$x_i$ for $i \in I$, we have
\[
\Pr\left(\bigcap_{i \in I} (X_i = x_i)\right)
=
\prod_{i \in I} \Pr(X_i = x_i).
\]

The \emph{expectation}\index{expectation} $\E[X]$ of a discrete random
variable\index{random variable!discrete} $X$ is
\[
\E[X]
=
\sum_i i \cdot \Pr(X = i)
\]
where the sum is taken over all values in the range of $X$. If
$\sum_i |i| \cdot \Pr(X = i)$ converges\index{convergence}, then the
expectation is \emph{finite}\index{expectation!finite}. Otherwise, the
expectation is said to be \emph{unbounded}\index{expectation!unbounded}.
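
Continuing the two-dice example, the expectation of the sum can be
computed directly from the definition. The following Python sketch,
again a minimal illustration, weights each value in the range of $X$
by its probability:
%%
\begin{verbatim}
from fractions import Fraction
from itertools import product

# E[X] = sum over i of i * Pr(X = i), where X is the sum of two dice.
omega = list(product(range(1, 7), repeat=2))
values = {sum(s) for s in omega}
expectation = sum(
    Fraction(i) * Fraction(sum(1 for s in omega if sum(s) == i),
                           len(omega))
    for i in values
)
print(expectation)  # 7
\end{verbatim}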

{\color{red}
\begin{itemize}
\item How does expectation compare with the dot product? List
  similarities and differences.
\end{itemize}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Linearity of expectations}
\index{expectation!linearity}

\begin{theorem}
\textbf{Linearity of expectations.}\index{expectation!linearity}
Let $X_1, X_2, \dots, X_n$ be a finite collection of discrete random
variables\index{random variable!discrete} with finite
expectations. Then
\[
\E\left[\sum_i X_i\right]
=
\sum_i \E[X_i].
\]
\end{theorem}

\begin{proof}
We use induction\index{induction} on the number of random
variables. For the base case, let $X$ and $Y$ be random variables. Use
the law of total probability\index{law of total probability} to get
%%
\begin{align*}
\E[X + Y]
&=
\sum_i \sum_j (i + j) \cdot \Pr\big((X = i) \cap (Y = j)\big) \\
&=
\sum_i \sum_j i \cdot \Pr\big((X = i) \cap (Y = j)\big)
+
\sum_i \sum_j j \cdot \Pr\big((X = i) \cap (Y = j)\big) \\
&=
\sum_i i \sum_j \Pr\big((X = i) \cap (Y = j)\big)
+
\sum_j j \sum_i \Pr\big((X = i) \cap (Y = j)\big) \\
&=
\sum_i i \cdot \Pr(X = i) + \sum_j j \cdot \Pr(Y = j) \\
&=
\E[X] + \E[Y].
\end{align*}
%%
The inductive case is left as an exercise.
\end{proof}

Linearity\index{expectation!linearity} of expectations holds for any
collection of random variables, even if they are not
independent. Furthermore, if $\sum_{i=1}^\infty \E[|X_i|]$
converges\index{convergence}, then it can be shown that
\[
\E\left[\sum_{i=1}^\infty X_i\right]
=
\sum_{i=1}^\infty \E[X_i].
\]
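
The finite case can be illustrated numerically even when the random
variables are dependent. In the Python sketch below, $Y = 7 - X$ is
completely determined by $X$, yet the simulated means still satisfy
$\E[X + Y] \approx \E[X] + \E[Y]$:
%%
\begin{verbatim}
import random

random.seed(0)
trials = 10**5
sum_x = sum_y = sum_xy = 0
for _ in range(trials):
    x = random.randint(1, 6)  # one roll of a die
    y = 7 - x                 # fully dependent on x
    sum_x += x
    sum_y += y
    sum_xy += x + y
# E[X] = E[Y] = 3.5 and E[X + Y] = 7 despite the dependence.
print(sum_x / trials, sum_y / trials, sum_xy / trials)
\end{verbatim}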

\begin{lemma}
\label{lem:discrete:linearity_constants}
\textbf{Linearity of constants.}\index{linearity!constants}
Let $c$ be any constant and $X$ a random variable. Then
\[
\E[cX]
=
c \cdot \E[X].
\]
\end{lemma}

\begin{proof}
The case $c = 0$ is trivial. Suppose $c \neq 0$. Then
%%
\begin{align*}
\E[cX]
&=
\sum_i i \cdot \Pr(cX = i) \\
&=
c \cdot \sum_i (i/c) \cdot \Pr(X = i/c) \\
&=
c \cdot \sum_k k \cdot \Pr(X = k) \\
&=
c \cdot \E[X]
\end{align*}
%%
as required.
\end{proof}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Jensen's inequality}
\index{Jensen's inequality}

We start off by proving that $\E[X^2] \geq (\E[X])^2$. The random
variable $Y = (X - \E[X])^2$ is nonnegative, hence $\E[Y] \geq 0$.
Use the linearity of expectations\index{expectation!linearity} and
Lemma~\ref{lem:discrete:linearity_constants} to obtain
%%
\begin{align*}
0 \leq \E[Y]
&=
\E\big[(X - \E[X])^2\big] \\
&=
\E\big[X^2 - 2X \cdot \E[X] + (\E[X])^2\big] \\
&=
\E[X^2] - 2 \cdot \E\big[X \cdot \E[X]\big] + (\E[X])^2 \\
&=
\E[X^2] - 2 \cdot \E[X] \cdot \E[X] + (\E[X])^2 \\
&=
\E[X^2] - (\E[X])^2.
\end{align*}
%%
It follows that $\E[X^2] \geq (\E[X])^2$, an example of a more general
result known as Jensen's inequality\index{Jensen's inequality}.

A function $f: \R \to \R$ is \emph{convex}\index{function!convex} if
for any $x_1, x_2$ and $0 \leq \lambda \leq 1$ we have
\[
f\big(\lambda x_1 + (1 - \lambda)x_2\big)
\leq
\lambda f(x_1) + (1 - \lambda) f(x_2).
\]
An alternative definition is provided by the following lemma.

\begin{lemma}
\label{lem:discrete:convex_function}
\textbf{Convex functions.}\index{function!convex}
Let $f$ be a twice differentiable\index{function!differentiable}
function. Then $f$ is convex if and only if $f''(x) \geq 0$ for all $x$.
\end{lemma}

\begin{theorem}
\label{thm:discrete:Jensen_inequality}
\textbf{Jensen's inequality.}\index{Jensen's inequality}
If $f$ is a convex\index{function!convex} function, then
\[
\E[f(X)]
\geq
f(\E[X]).
\]
\end{theorem}

\begin{proof}
We assume that $f$ has a Taylor expansion\index{Taylor!expansion}. Let
$\mu = \E[X]$. By Taylor's theorem\index{Taylor!theorem}, for each
value $x$ of $X$ there is some $c$ between $x$ and $\mu$ such that
%%
\begin{align*}
f(x)
&=
f(\mu) + f'(\mu) \cdot (x - \mu) + \frac{f''(c) \cdot (x - \mu)^2}{2} \\
&\geq
f(\mu) + f'(\mu) \cdot (x - \mu)
\end{align*}
%%
where $f''(c) \geq 0$ by
Lemma~\ref{lem:discrete:convex_function}. In particular, the
inequality holds with $x$ replaced by the random variable $X$. Take
the expectations of both sides, apply linearity of
expectations\index{expectation!linearity}, and
Lemma~\ref{lem:discrete:linearity_constants} to produce
%%
\begin{align*}
\E[f(X)]
&\geq
\E[f(\mu) + f'(\mu) \cdot (X - \mu)] \\
&=
f(\mu) + f'(\mu) \cdot (\E[X] - \mu) \\
&=
f(\mu) \\
&=
f(\E[X])
\end{align*}
%%
and the theorem follows.
\end{proof}
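
As a numerical illustration of
Theorem~\ref{thm:discrete:Jensen_inequality}, take $X$ to be a fair
die and $f(x) = e^x$, which is convex since $f''(x) = e^x \geq 0$. The
Python sketch below compares $\E[f(X)]$ with $f(\E[X])$:
%%
\begin{verbatim}
import math

# X is a fair die; f(x) = exp(x) is convex.
values = range(1, 7)
e_x = sum(values) / 6                        # E[X] = 3.5
e_fx = sum(math.exp(i) for i in values) / 6  # E[f(X)]
# E[f(X)] is about 106.1, while f(E[X]) is about 33.1.
print(e_fx, math.exp(e_x))
\end{verbatim}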


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Bernoulli and binomial random variables}
\index{random variable!Bernoulli}
\index{random variable!binomial}

Consider an experiment\index{experiment} that
succeeds\index{probability!success} with probability $p$ but
fails\index{probability!failure} with probability $1 - p$. Let $X$ be
a random variable given by
\[
X
=
\begin{cases}
1, & \text{if the experiment succeeds}, \\
0, & \text{otherwise.}
\end{cases}
\]
The random variable takes on one of two possible values: either $1$
or $0$. Such a ``binary''\index{random variable!binary}
random variable is referred to as a
\emph{Bernoulli}\index{random variable!Bernoulli} or an
\emph{indicator}\index{random variable!indicator} random variable. The
expectation of the Bernoulli\index{random variable!Bernoulli} random
variable $X$ is
%%
\begin{align*}
\E[X]
&=
1 \cdot p + 0 \cdot (1 - p) \\
&=
p \\
&=
\Pr(X = 1).
\end{align*}

Instead of one experiment\index{experiment}, now consider a sequence
of $n$ experiments. The experiments are
independent\index{experiment!independent} of one another, each with
success\index{probability!success} and
failure\index{probability!failure} probabilities $p$ and $1 - p$,
respectively. Let $X$ be the number of
successes\index{experiment!success} in the $n$ experiments. Then $X$
has a \emph{binomial distribution}\index{distribution!binomial}.

Let $B(n,p)$ denote a binomial random
variable\index{random variable!binomial} $X$ with parameters $n$ and
$p$. The binomial random variable\index{random variable!binomial} is
defined by the following probability distribution on
$i = 0, 1, 2, \dots, n$:
%%
\begin{equation}
\label{eq:discrete:binomial_distribution}
\index{distribution!binomial}
\Pr(X = i)
=
\binom{n}{i} p^i (1 - p)^{n-i}.
\end{equation}
%%
The binomial random variable\index{random variable!binomial} $X$
equals $i$ whenever there are exactly $i$ successes and $n - i$
failures in $n$ independent experiments, each having the same
probability of success $p$. The binomial
distribution\index{distribution!binomial}
\eqref{eq:discrete:binomial_distribution} ensures that
\[
\sum_{i=0}^n \Pr(X = i)
=
1
\]
which is a necessary condition for the binomial
distribution\index{distribution!binomial}
\eqref{eq:discrete:binomial_distribution} to qualify as a probability
function\index{probability!function}.
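
The normalization can be checked numerically. The Python sketch below,
assuming Python~3.8 or later for \texttt{math.comb}, sums the binomial
probabilities \eqref{eq:discrete:binomial_distribution} over
$i = 0, 1, \dots, n$:
%%
\begin{verbatim}
from math import comb

def binomial_pmf(n, p, i):
    # Pr(X = i) for a binomial random variable with parameters n, p.
    return comb(n, i) * p**i * (1 - p)**(n - i)

n, p = 10, 0.3
total = sum(binomial_pmf(n, p, i) for i in range(n + 1))
print(total)  # 1.0, up to floating-point rounding
\end{verbatim}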

We can use the binomial
identity\index{binomial!theorem}\index{binomial!identity}
\[
(x + y)^n
=
\sum_{i=0}^n \binom{n}{i} x^i y^{n-i}
\]
to compute the expectation of a binomial random
variable\index{random variable!binomial} $X$. Using the identity
$i \binom{n}{i} = n \binom{n-1}{i-1}$ and the substitution $k = i - 1$,
we obtain
%%
\begin{align*}
\E[X]
&=
\sum_{i=0}^n i \binom{n}{i} p^i (1 - p)^{n-i} \\[4pt]
&=
np \sum_{k=0}^{n-1}
\frac{(n - 1)!}{k! \big((n-1) - k\big)!} p^k (1 - p)^{(n - 1) - k} \\[4pt]
&=
np \sum_{k=0}^{n-1} \binom{n-1}{k} p^k (1-p)^{(n-1) - k} \\[4pt]
&=
np \big(p + (1 - p)\big)^{n-1} \\[4pt]
&=
np.
\end{align*}
%%
An alternative approach uses the linearity of
expectations\index{expectation!linearity}. Let $X$ be a binomial
random variable\index{random variable!binomial} with parameters $n$
and $p$. Then $X$ counts the number of successes in $n$ trials, where
each trial has the same probability of success $p$. Define a set of
$n$ Bernoulli random variables\index{random variable!Bernoulli}
$X_1, X_2, \dots, X_n$ by
\[
X_i
=
\begin{cases}
1, & \text{if the $i$-th trial is successful,} \\
0, & \text{otherwise.}
\end{cases}
\]
The expectation of each $X_i$ is $p$ and $X = \sum_i X_i$. Conclude by
the linearity of expectations\index{expectation!linearity} that
%%
\begin{align*}
\E[X]
&=
\E\left[\sum_i X_i\right] \\[4pt]
&=
\sum_i \E[X_i] \\
&=
np.
\end{align*}
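
Both derivations can be checked with a short Python sketch. The code
below computes $\E[X]$ directly from the binomial distribution and
also simulates $X$ as a sum of $n$ Bernoulli trials:
%%
\begin{verbatim}
import random
from math import comb

n, p = 10, 0.3

# Direct sum: E[X] = sum over i of i * Pr(X = i).
direct = sum(i * comb(n, i) * p**i * (1 - p)**(n - i)
             for i in range(n + 1))

# Simulation: X as a sum of n Bernoulli random variables.
random.seed(1)
trials = 10**5
total = sum(sum(1 for _ in range(n) if random.random() < p)
            for _ in range(trials))

print(direct, total / trials, n * p)  # all close to 3.0
\end{verbatim}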


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Conditional expectation}
\index{expectation!conditional}

The \emph{conditional expectation}\index{expectation!conditional} of a
random variable $X$ given a random variable $Y$ is
\[
\E[X \mid Y=y]
=
\sum_x x \cdot \Pr(X=x \mid Y=y)
\]
where the sum is taken over all values in the range of $X$. The
conditional expectation\index{expectation!conditional} is, like the
expectation, a weighted sum. Whereas the expectation of a random
variable $X$ weights each value by the probability that $X$ assumes
that value, the conditional expectation\index{expectation!conditional}
of $X$ weights each value by the conditional probability of $X$
assuming that value.
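
Returning to the two-dice example, let $X$ be the sum of the dice and
$Y$ the value of the first die. The Python sketch below computes
$\E[X \mid Y = y]$ by restricting the sample space to the outcomes
whose first die equals $y$:
%%
\begin{verbatim}
from fractions import Fraction
from itertools import product

# X = sum of two dice, Y = value of the first die.
omega = list(product(range(1, 7), repeat=2))

def cond_expectation(y):
    # E[X | Y = y]: average the sum over the outcomes with first
    # die equal to y, which are equally likely given Y = y.
    matching = [s for s in omega if s[0] == y]
    return sum(Fraction(sum(s), len(matching)) for s in matching)

print(cond_expectation(1))  # 9/2, that is, 1 + 7/2
\end{verbatim}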

\begin{lemma}
\label{lem:discrete:expectation_indepdent_random_variables}
Let $X$ and $Y$ be independent\index{random variable!independent}
random variables. Then
\[
\E[X]
=
\sum_y \Pr(Y=y) \cdot \E[X \mid Y=y]
\]
where the summation is over all values in the range of $Y$, provided
that all of the expectations exist.
\end{lemma}

\begin{proof}
The result follows from the definitions of
independent\index{random variable!independent} random variables and
conditional\index{expectation!conditional} expectations:
%%
\begin{align*}
\sum_y \Pr(Y = y) \cdot \E[X \mid Y=y]
&=
\sum_y \Pr(Y = y) \cdot \sum_x x \cdot \Pr(X=x \mid Y=y) \\
&=
\sum_x \sum_y x \cdot \Pr(X=x \mid Y=y) \cdot \Pr(Y = y) \\
&=
\sum_x \sum_y x \cdot \Pr(X = x) \cdot \Pr(Y = y) \\
&=
\sum_x x \cdot \Pr(X = x) \cdot \sum_y \Pr(Y = y) \\
&=
\sum_x x \cdot \Pr(X = x).
\end{align*}
%%
Finally, use the definition of expectation to obtain
$\sum_y \Pr(Y = y) \cdot \E[X \mid Y=y] = \E[X]$ as required.
\end{proof}

\begin{lemma}
Let $X_1, X_2, \dots, X_n$ be a finite collection of discrete random
variables\index{random variable!discrete} with
finite\index{expectation!finite} expectations. For any random variable
$Y$, we have
\[
\E\left[\left. \sum_i X_i \;\right| Y=y \right]
=
\sum_i \E[X_i \mid Y=y].
\]
\end{lemma}

The following definition requires some thought in order to absorb its
meaning. For two random variables $X$ and $Y$, the expression
$\E[X \mid Y]$ is a random variable $f(Y)$ that assumes the value
$\E[X \mid Y=y]$ when $Y = y$. The expression $\E[X \mid Y]$ is not a
real value, but a function $f$ of the random variable $Y$ given by
%%
\begin{align*}
f: \R &\to \R \\
y &\mapsto \E[X \mid Y=y].
\end{align*}
%%
Considering that $\E[X \mid Y]$ is a random variable, how are we to
compute its expectation $\E\big[\E[X \mid Y]\big]$?

\begin{theorem}
The expectation of the random variable $\E[X \mid Y]$ is
\[
\E[X]
=
\E\big[\E[X \mid Y]\big].
\]
\end{theorem}

\begin{proof}
We know that $\E[X \mid Y]$ is the random variable $f(Y)$ that takes
on the value $\E[X \mid Y=y]$ when $Y = y$. Use the definition of
expectation and
Lemma~\ref{lem:discrete:expectation_indepdent_random_variables} to get
%%
\begin{align*}
\E\big[\E[X \mid Y]\big]
&=
\sum_y \E[X \mid Y=y] \cdot \Pr(Y = y) \\
&=
\E[X]
\end{align*}
%%
as required.
\end{proof}
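
The theorem can be checked on the two-dice example, where $X$ is the
sum and $Y$ the first die. The Python sketch below computes
$\E\big[\E[X \mid Y]\big]$ by weighting each $\E[X \mid Y=y]$ by
$\Pr(Y = y)$ and compares the result with $\E[X]$:
%%
\begin{verbatim}
from fractions import Fraction
from itertools import product

# X = sum of two dice, Y = value of the first die.
omega = list(product(range(1, 7), repeat=2))

def cond_exp(y):
    matching = [s for s in omega if s[0] == y]
    return sum(Fraction(sum(s), len(matching)) for s in matching)

tower = sum(Fraction(1, 6) * cond_exp(y) for y in range(1, 7))
direct = sum(Fraction(sum(s), len(omega)) for s in omega)
print(tower, direct)  # both 7
\end{verbatim}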


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Geometric distribution}
\index{distribution!geometric}

The binomial distribution\index{distribution!binomial} deals with a
sequence of independent\index{experiment!independent} experiments,
each with the same probability of success\index{probability!success},
and provides us with a distribution of
successes\index{distribution!success}. Instead of performing $n$
experiments\index{experiment} and then counting the total successes,
we now consider a slightly different situation. We repeat an
experiment until
we obtain the first success. The sequence of
experiments\index{experiment} up to and including the first success
has the same success\index{probability!success} probability $p$, and
each trial is independent of the others. To find out how many
trials\index{experiment!trial} are required, we consider the
distribution of the number of
trials\index{distribution!trial}. Letting $X$ represent the number of
trials up to and including the trial resulting in the first success,
we have a \emph{geometric distribution}\index{distribution!geometric}
with geometric random variable\index{random variable!geometric} $X$
and parameter $p$. That is, $X$ is given by the following geometric
distribution\index{distribution!geometric}
\[
\Pr(X = n)
=
(1 - p)^{n-1} p
\]
for $n = 1, 2, \dots$.
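
The geometric distribution is easy to simulate: repeat a Bernoulli
trial until the first success and count the trials. The Python sketch
below compares the empirical frequencies with $(1 - p)^{n-1} p$:
%%
\begin{verbatim}
import random
from collections import Counter

random.seed(2)
p, trials = 0.25, 10**5

def geometric(p):
    # Number of trials up to and including the first success.
    n = 1
    while random.random() >= p:
        n += 1
    return n

counts = Counter(geometric(p) for _ in range(trials))
for n in range(1, 5):
    print(n, counts[n] / trials, (1 - p)**(n - 1) * p)
\end{verbatim}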

In a sequence $E_1, E_2, \dots, E_n$ of $n$
independent\index{experiment!independent} experiments each with the
same probability of success\index{probability!success} $p$, suppose
that $E_i$ results in failure for $1 \leq i < n$ and that $E_n$
results in the first success. For the geometric random
variable\index{random variable!geometric} $X$ to equal $n$, there must
be $n - 1$ failures followed by a success. It can be shown that $X$
satisfies
\[
\sum_{n \geq 1} \Pr(X = n)
=
1
\]
a condition necessary for the geometric
distribution\index{distribution!geometric} to qualify as a probability
function\index{probability!function}.

\begin{lemma}
\label{lem:discrete:memoryless_geometric_random_variable}
If $X$ is a geometric random variable\index{random variable!geometric}
with parameter $p$, then for all integers $n > 0$ and $k \geq 0$ we have
\[
\Pr(X = n+k \mid X > k)
=
\Pr(X = n).
\]
\end{lemma}

\begin{proof}
If $x$ is such that $0 < x < 1$, then it can be shown that
\[
\sum_{i=k}^\infty x^i
=
\frac{x^k}{1 - x}.
\]
Use the latter result and the definition of
conditional\index{probability!conditional} probability to obtain
%%
\begin{align*}
\Pr(X = n+k \mid X > k)
&=
\frac{\Pr\big((X = n+k) \cap (X > k)\big)} {\Pr(X > k)} \\[4pt]
&=
\frac{\Pr(X = n+k)}{\Pr(X > k)} \\[4pt]
&=
\frac{(1 - p)^{n+k-1} p} {\sum_{i=k}^\infty (1 - p)^i p} \\[4pt]
&=
\frac{(1 - p)^{n+k-1} p}{(1 - p)^k} \\[4pt]
&=
(1 - p)^{n-1} p \\
&=
\Pr(X = n)
\end{align*}
%%
as required.
\end{proof}

Lemma~\ref{lem:discrete:memoryless_geometric_random_variable} can be
interpreted as saying that geometric random
variables\index{random variable!geometric} are
``memoryless''\index{memoryless}. Regardless of how many failures we
have experienced, this has no bearing on the probability of reaching
the first success in $n$ trials from now.
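
The memoryless property is simple to verify numerically from the
closed forms used in the proof. The Python sketch below compares
$\Pr(X = n+k \mid X > k)$ with $\Pr(X = n)$:
%%
\begin{verbatim}
p, n, k = 0.3, 4, 5

# Pr(X = n+k | X > k) = Pr(X = n+k) / Pr(X > k), where
# Pr(X > k) = (1 - p)^k for a geometric random variable.
lhs = ((1 - p)**(n + k - 1) * p) / (1 - p)**k
rhs = (1 - p)**(n - 1) * p
print(lhs, rhs)  # equal, up to floating-point rounding
\end{verbatim}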

\begin{lemma}
\label{lem:discrete:expectation_nonnegative_discrete_random_variable}
If $X$ is a discrete random variable\index{random variable!discrete}
that takes on only nonnegative integer values, then
\[
\E[X]
=
\sum_{i=1}^\infty \Pr(X \geq i).
\]
\end{lemma}

\begin{proof}
This is a straightforward application of
double\index{double summation} summation:
%%
\begin{align*}
\sum_{i=1}^\infty \Pr(X \geq i)
&=
\sum_{i=1}^\infty \sum_{j=i}^\infty \Pr(X = j) \\
&=
\sum_{j=1}^\infty \sum_{i=1}^j \Pr(X = j) \\
&=
\sum_{j=1}^\infty j \cdot \Pr(X = j) \\
&=
\E[X]
\end{align*}
%%
and the lemma follows from the above chain of equalities.
\end{proof}

If $X$ is a geometric random variable\index{random variable!geometric}
with parameter $p$, then
%%
\begin{align*}
\Pr(X \geq i)
&=
\sum_{n=i}^\infty (1 - p)^{n-1} p \\
&=
(1 - p)^{i-1}
\end{align*}
%%
where the last equality uses the fact that for $0 < x < 1$ we have
$\sum_{i=k}^\infty x^i = x^k / (1-x)$. Use
Lemma~\ref{lem:discrete:expectation_nonnegative_discrete_random_variable}
to get
%%
\begin{align*}
\E[X]
&=
\sum_{i=1}^\infty \Pr(X \geq i) \\
&=
\sum_{i=1}^\infty (1 - p)^{i-1} \\
&=
\frac{1}{1-p} \sum_{i=1}^\infty (1 - p)^i \\
&=
\frac{1}{p}.
\end{align*}
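
The Python sketch below checks this value by truncating the tail sum of
Lemma~\ref{lem:discrete:expectation_nonnegative_discrete_random_variable}
after enough terms that the remainder is negligible:
%%
\begin{verbatim}
# E[X] = sum over i >= 1 of Pr(X >= i) = sum of (1-p)^(i-1),
# truncated here after 2000 terms.
p = 0.25
tail_sum = sum((1 - p)**(i - 1) for i in range(1, 2001))
print(tail_sum, 1 / p)  # both approximately 4.0
\end{verbatim}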


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Problems}

\begin{problem}
\item For any random variable $X$ and any even integer $k \geq 2$,
  show that $\E[X^k] \geq (\E[X])^k$.

\item Let $x_1, \dots, x_n$ and $\lambda_1, \dots, \lambda_n$ be real
  numbers such that $\lambda_i \geq 0$ for all $i$ and
  $\sum_i \lambda_i = 1$. If $f: \R \to \R$ is convex, show that
  \[
  f\left(\sum_i \lambda_i x_i\right)
  \leq
  \sum_i \lambda_i f(x_i).
  \]
  If $X$ is a random variable that assumes finitely many values, use
  the latter inequality to show that
  \[
  \E[f(X)]
  \geq
  f(\E[X]).
  \]

\item Let $X_1, X_2, \dots$ be an
  infinite\index{random variable!infinite sequence} sequence of random
  variables such that the infinite series
  \[
  \sum_{i=1}^\infty \E[|X_i|]
  \]
  converges. Show that
  \[
  \E\left[\sum_{i=1}^\infty X_i\right]
  =
  \sum_{i=1}^\infty \E[X_i].
  \]
\end{problem}
